From 1d1c34f3c83e12d65a3f18b0b748ddd9b0040ddf Mon Sep 17 00:00:00 2001
From: =?utf8?q?=C3=98yvind=20Kol=C3=A5s?= <pippin@gimp.org>
Date: Sat, 19 Nov 2016 18:27:06 +0100
Subject: [PATCH] make pow implementations inlineable - gaining some fixed
 amount of blitting performance

---
 babl/base/pow-24.c      |   3 +-
 babl/base/pow-24.h      | 118 ++++++++++++++++++++++++++++++++++++++--
 extensions/sse2-float.c |  14 ++---
 3 files changed, 123 insertions(+), 12 deletions(-)

diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c
index 03fca43..dbc4071 100644
--- a/babl/base/pow-24.c
+++ b/babl/base/pow-24.c
@@ -18,7 +18,7 @@
 
 #include <math.h>
 #include "util.h"
-
+#if 0
 /* a^b = exp(b*log(a))
  *
  * Extracting the exponent from a float gives us an approximate log.
@@ -125,3 +125,4 @@ babl_pow_1_24f (float x)
     y = (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
   return x*y;
 }
+#endif
diff --git a/babl/base/pow-24.h b/babl/base/pow-24.h
index 2fb338f..cbd6fd4 100644
--- a/babl/base/pow-24.h
+++ b/babl/base/pow-24.h
@@ -19,9 +19,119 @@
 #ifndef _BASE_POW_24_H
 #define _BASE_POW_24_H
 
-extern double babl_pow_1_24 (double x);
-extern double babl_pow_24 (double x);
-extern float  babl_pow_1_24f (float x);
-extern float  babl_pow_24f (float x);
+static double babl_pow_1_24 (double x);
+static double babl_pow_24 (double x);
+static float  babl_pow_1_24f (float x);
+static float  babl_pow_24f (float x);
+
+
+/* a^b = exp(b*log(a))
+ *
+ * Extracting the exponent from a float gives us an approximate log.
+ * Or better yet, reinterpret the bitpattern of the whole float as an int.
+ *
+ * However, the output values of 12throot vary by less than a factor of 2
+ * over the domain we care about, so we only get log() that way, not exp().
+ *
+ * Approximate exp() with a low-degree polynomial; not exactly equal to the
+ * Taylor series since we're minimizing maximum error over a certain finite
+ * domain. It's not worthwhile to use lots of terms, since Newton's method
+ * has a better convergence rate once you get reasonably close to the answer.
+ */
+static inline double
+init_newton (double x, double exponent, double c0, double c1, double c2)
+{
+    int iexp;
+    double y = frexp(x, &iexp);
+    y = 2*y+(iexp-2);
+    c1 *= M_LN2*exponent;
+    c2 *= M_LN2*M_LN2*exponent*exponent;
+    return y = c0 + c1*y + c2*y*y;
+}
+
+/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5).
+ */
+static inline double
+babl_pow_24 (double x)
+{
+  double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+  int i;
+  for (i = 0; i < 3; i++)
+    y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
+  x *= y;
+  return x*x*x;
+}
+
+/* Returns x^(1/2.4) == x*((x^(-1/6))^(1/2))^7, using Newton's method for x^(-1/6).
+ */
+static inline double
+babl_pow_1_24 (double x)
+{
+  double y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+  int i;
+  double z;
+  x = sqrt (x);
+  /* newton's method for x^(-1/6) */
+  z = (1./6.) * x;
+  for (i = 0; i < 3; i++)
+    y = (7./6.) * y - z * ((y*y)*(y*y)*(y*y*y));
+  return x*y;
+}
+
+//////////////////////////////////////////////
+/* a^b = exp(b*log(a))
+ *
+ * Extracting the exponent from a float gives us an approximate log.
+ * Or better yet, reinterpret the bitpattern of the whole float as an int.
+ *
+ * However, the output values of 12throot vary by less than a factor of 2
+ * over the domain we care about, so we only get log() that way, not exp().
+ *
+ * Approximate exp() with a low-degree polynomial; not exactly equal to the
+ * Taylor series since we're minimizing maximum error over a certain finite
+ * domain. It's not worthwhile to use lots of terms, since Newton's method
+ * has a better convergence rate once you get reasonably close to the answer.
+ */
+static inline float
+init_newtonf (float x, float exponent, float c0, float c1, float c2)
+{
+    int iexp;
+    float y = frexpf(x, &iexp);
+    y = 2*y+(iexp-2);
+    c1 *= M_LN2*exponent;
+    c2 *= M_LN2*M_LN2*exponent*exponent;
+    return y = c0 + c1*y + c2*y*y;
+}
+
+/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5).
+ */
+static inline float
+babl_pow_24f (float x)
+{
+  float y = init_newtonf (x, -1.f/5, 0.9953189663f, 0.9594345146f, 0.6742970332f);
+  int i;
+  for (i = 0; i < 3; i++)
+    y = (1.f+1.f/5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
+  x *= y;
+  return x*x*x;
+}
+
+/* Returns x^(1/2.4) == x*((x^(-1/6))^(1/2))^7, using Newton's method for x^(-1/6).
+ */
+static inline float
+babl_pow_1_24f (float x)
+{
+  float y = init_newtonf (x, -1.f/12, 0.9976800269f, 0.9885126933f, 0.5908575383f);
+  int i;
+  float z;
+  x = sqrtf (x);
+  /* newton's method for x^(-1/6) */
+  z = (1.f/6.f) * x;
+  for (i = 0; i < 3; i++)
+    y = (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
+  return x*y;
+}
+
+
 
 #endif
diff --git a/extensions/sse2-float.c b/extensions/sse2-float.c
index 8148bf2..72463ee 100644
--- a/extensions/sse2-float.c
+++ b/extensions/sse2-float.c
@@ -244,7 +244,7 @@ conv_rgbAF_linear_rgbaF_linear_spin (const float *src, float *dst, long samples)
 #define FLT_MANTISSA (1<<23)
 
 static inline __v4sf
-init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
+sse_init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
 {
     double norm = exponent*M_LN2/FLT_MANTISSA;
     __v4sf y = _mm_cvtepi32_ps((__m128i)((__v4si)x - splat4i(FLT_ONE)));
@@ -252,10 +252,10 @@ init_newton (__v4sf x, double exponent, double c0, double c1, double c2)
 }
 
 static inline __v4sf
-pow_1_24 (__v4sf x)
+sse_pow_1_24 (__v4sf x)
 {
   __v4sf y, z;
-  y = init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
+  y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
   x = _mm_sqrt_ps (x);
   /* newton's method for x^(-1/6) */
   z = splat4f (1.f/6.f) * x;
@@ -265,10 +265,10 @@ pow_1_24 (__v4sf x)
 }
 
 static inline __v4sf
-pow_24 (__v4sf x)
+sse_pow_24 (__v4sf x)
 {
   __v4sf y, z;
-  y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+  y = sse_init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
   /* newton's method for x^(-1/5) */
   z = splat4f (1.f/5.f) * x;
   y = splat4f (6.f/5.f) * y - z * ((y*y*y)*(y*y*y));
@@ -280,7 +280,7 @@ pow_24 (__v4sf x)
 static inline __v4sf
 linear_to_gamma_2_2_sse2 (__v4sf x)
 {
-  __v4sf curve = pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
+  __v4sf curve = sse_pow_1_24 (x) * splat4f (1.055f) - splat4f (0.055f);
   __v4sf line = x * splat4f (12.92f);
   __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.003130804954f));
   return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
@@ -289,7 +289,7 @@ linear_to_gamma_2_2_sse2 (__v4sf x)
 static inline __v4sf
 gamma_2_2_to_linear_sse2 (__v4sf x)
 {
-  __v4sf curve = pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
+  __v4sf curve = sse_pow_24 ((x + splat4f (0.055f)) * splat4f (1/1.055f));
   __v4sf line = x * splat4f (1/12.92f);
   __v4sf mask = _mm_cmpgt_ps (x, splat4f (0.04045f));
   return _mm_or_ps (_mm_and_ps (mask, curve), _mm_andnot_ps (mask, line));
-- 
2.30.2